---
title: "Riesgo de default en tarjetas"
description: "Ejercicio inicial flexdashboard"
output:
flexdashboard::flex_dashboard:
orientation: rows
vertical_layout: fill
source_code: embed
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(message = FALSE, warning = FALSE, echo = FALSE)
```
```{r packages-data}
library(flexdashboard)
library(shiny)
library(tidyverse)
library(ISLR)
library(scales)
library(viridisLite)
library(knitr)
library(pROC)
library(kableExtra)
#Datos y procesamiento
data("Default")
df <- Default %>%
as_tibble() %>%
mutate(
default = default == "Yes",
student = factor(student, levels = c("No","Yes")),
balance = as.numeric(balance),
income = as.numeric(income)
)
#KPI's
kpi_tasa <- mean(df$default)
kpi_n <- nrow(df)
kpi_balance <- mean(df$balance)
#Colorea KPI de acuerdo a umbral
kpi_color <- if (kpi_tasa >= 0.05) "danger" else if (kpi_tasa >= 0.02) "warning" else "success"
# Deciles de balance (línea guía)
by_bal <- df %>%
mutate(balance_dec = cut_number(balance, 10)) %>%
group_by(balance_dec) %>%
summarise(tasa = mean(default), .groups = "drop") %>%
mutate(idx = row_number())
#Segmentos y modelo
seg <- df %>% group_by(student) %>% summarise(tasa = mean(default), n= n(), .groups="drop")
mod <- glm(default ~ balance + income + student, data = df, family = binomial)
preds <- predict(mod, type = "response")
# Curvas por estatus manteniendo ingreso en mediana
newd_no <- tibble(
balance = seq(min(df$balance), max(df$balance), length.out = 300),
income = median(df$income),
student = factor("No", levels = levels(df$student))
)
newd_yes <- newd_no %>% mutate(student = factor("Yes", levels = levels(df$student)))
newd_no <- newd_no %>% mutate(prob = predict(mod, newdata = newd_no, type = "response"),
grupo = "No estudiante")
newd_yes <- newd_yes %>% mutate(prob = predict(mod, newdata = newd_yes, type = "response"),
grupo = "Estudiante")
curvas <- bind_rows(newd_no, newd_yes)
# Muestra ligera para dispersión (si la usas en la pestaña 2)
set.seed(1)
df_sample <- df %>% sample_n(2000)
# Tabla top 20 por probabilidad estimada
risk_tbl <- df %>%
mutate(prob_modelo = preds) %>%
arrange(desc(prob_modelo)) %>%
transmute(
default = if_else(default, "Sí", "No"),
student, balance, income,
`Prob. modelo` = percent(prob_modelo, accuracy = 0.1)
) %>%
head(20)
# --- Objetos para ROC / Calibración / Gains-Lift ---
roc_obj <- pROC::roc(df$default, preds, quiet = TRUE)
auc_val <- as.numeric(roc_obj$auc)
roc_df <- tibble(tpr = roc_obj$sensitivities, fpr = 1 - roc_obj$specificities)
# Umbral "óptimo" por Youden (docente; en práctica se fija por negocio)
thr_best <- as.numeric(coords(roc_obj, "best", best.method = "youden", ret = "threshold"))
# Matriz de confusión en ese umbral
pred_class <- preds >= thr_best
tp <- sum(pred_class & df$default)
fp <- sum(pred_class & !df$default)
tn <- sum(!pred_class & !df$default)
fn <- sum(!pred_class & df$default)
acc <- (tp + tn) / (tp + tn + fp + fn)
prec <- ifelse(tp + fp == 0, NA, tp / (tp + fp))
rec <- ifelse(tp + fn == 0, NA, tp / (tp + fn))
spec <- ifelse(tn + fp == 0, NA, tn / (tn + fp))
f1 <- ifelse(is.na(prec) | is.na(rec) | (prec + rec) == 0, NA, 2 * prec * rec / (prec + rec))
metrics_tbl <- tibble(
Métrica = c("AUC", "Umbral (Youden)", "Accuracy", "Precision", "Recall (TPR)", "Specificity (TNR)", "F1"),
Valor = c(auc_val, thr_best, acc, prec, rec, spec, f1)
) %>%
mutate(Valor = ifelse(Métrica %in% c("AUC","Accuracy","Precision","Recall (TPR)","Specificity (TNR)","F1"),
percent(Valor, accuracy = 0.1), scales::number(Valor, accuracy = 0.001)))
# Calibración (10 bins por score)
calib <- tibble(pred = preds, y = df$default) %>%
mutate(bin = ntile(pred, 10)) %>%
group_by(bin) %>%
summarise(
pred_media = mean(pred),
tasa_obs = mean(y),
n = n(), .groups = "drop"
)
```
Resumen ejecutivo
========================
Column {.sidebar}
----------------------------------------------------------------------
### Guía rápida
Conjunto de datos simulados que contiene 10,000 observaciones de usuarios de TDC
**Datos**: `ISLR::Default` (10k clientes).
**Variables**:
- `default` (Yes/No): si la persona cayó en inclumplimiento
- `student` (Yes/No): estatus de estudiante
- `balance` (numérica): balance promedio de la TDC (USD)
- `income` (numérica): ingreso anual (USD)
Row {data-height=200}
----------------------------------------------------------------------
###
```{r}
valueBox(value=percent(kpi_tasa, accuracy=0.1),caption ='Tasa de default', icon="fa-exclamation-triangle", color = kpi_color)
```
###
```{r}
valueBox(value = comma(kpi_n),caption = "Observaciones", icon="fa-users", color = "primary")
```
###
```{r}
valueBox(value = dollar(kpi_balance),caption = "Balance promedio", icon="fa-credit-card", color = "info")
```
Row {data-height=600}
-----------------------------------------------------------------------
### Default por estatus de estudiante
```{r}
seg %>%
ggplot(aes(student,tasa, fill = student)) +
geom_col() +
scale_fill_manual(values = viridisLite::viridis(2), guide="none")+
geom_text(aes(label = percent(tasa,accuracy=0.1)), vjust = -0-3, size = 4) +
scale_y_continuous(labels = percent, limits = c(0,NA))+
labs(x=NULL, y="Tasa de default", caption = "Fuente: ISLR::Default (n=10,000)")+
theme_minimal(base_size = 13)
```
### Deciles de balance (resumen)
```{r}
last_pt <- by_bal %>% filter(idx == max(idx))
ggplot(by_bal, aes(as.numeric(balance_dec), tasa, color = tasa, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_c()+
geom_point( data = last_pt, aes(as.numeric(balance_dec), tasa)) +
scale_y_continuous(labels = scales::percent, limits = c(0,NA)) +
scale_x_continuous(breaks = c(1,5,10), labels = c("Q1", "Q5", "Q10")) +
labs(x="Deciles de balance (Q1-Q10)", y = "Tasa de default", caption ="A mayor balance, suele aumentar la tasa de default") +
theme_minimal(base_size = 13)+
theme(panel.grid.minor = element_blank())
```
Row {.tabset data-height=200}
-------------------------------------------------------------------------
### Notas de lectura
- **Balance** es el principal driver del riesgo
- El efecto de **student** puede cambiar al controlar por balance (ojo con Simpson).
- Los KPI's dan contexto para detectar outliers en los gráficos
### Conversiones de color
- Paleta **viridis** (daltónica) para categorías.
- El **value box** de tasa usa semaforo didáctico: `sucess`(<2%), `warning` (2-5%), `danger` (>5%).
Patrones y modelo
=============
#### Exploración del modelo {.tabset .tabset-fade}
Row
--------------------------------------------------------------------
### Curva ROC y AUC
```{r}
ggplot(roc_df, aes(fpr, tpr)) +
geom_abline(slope = 1, intercept = 0, linetype = "dotted") +
geom_path(size = 1, alpha = 0.9) +
coord_equal() +
labs(x = "FPR (1 - Especificidad)", y = "TPR (Sensibilidad)",
caption = paste0("AUC = ", scales::percent(auc_val, accuracy = 0.1))) +
theme_minimal(base_size = 13)
```
### Calibración (observado vs. estimado)
```{r}
ggplot(calib, aes(pred_media, tasa_obs)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
geom_point(size = 2) +
geom_line() +
scale_x_continuous(labels = percent, limits = c(0, NA)) +
scale_y_continuous(labels = percent, limits = c(0, NA)) +
labs(x = "Prob. estimada (media por bin)",
y = "Tasa observada (por bin)",
caption = "10 bins por score — ideal ≈ línea 45°") +
theme_minimal(base_size = 13)
```
### Probabilidad de default vs. balance (ingreso mediano)
```{r}
ggplot(df_sample, aes(balance, as.numeric(default))) +
geom_jitter(height= 0.02, alpha = 0.1) +
geom_line(data = curvas, aes(balance, prob, linetype = grupo), linewidth = 1) +
scale_y_continuous(labels = percent, limits = c(0,1)) +
labs(x = "Balance (USD)",
y= "Probabilidad de default (estimada)",
linetype = "Grupo",
caption = "Línea continua: No estudiante; Discontinua: Estudiante") +
theme_minimal(base_size = 13)
```
Row {data-height=300}
--------------------------------------------------------------------
### Top 5 por riesgo estimado
```{r}
risk_tbl %>%
mutate(
Balance = scales::dollar(balance),
Ingreso = scales::dollar(income)
) %>%
transmute(
`Default` = default,
`Estudiante` = student,
`Balance` = Balance,
`Ingreso` = Ingreso,
`Prob. modelo`= `Prob. modelo`
) %>%
kable("html", align=c("c", "c", "r", "r","r"), escape=FALSE) %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "condensed"))%>%
column_spec(1, width = "5em") %>%
column_spec(1, width = "5em") %>%
column_spec(1, width = "10em") %>%
column_spec(1, width = "10em") %>%
column_spec(1, width = "10em")
```
### Métricas y umbral (Youden)
```{r}
kable(metrics_tbl, "html", align=c("l", "r"),
col.names= c("Métrica", "Valor"), escape=FALSE) %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "condensed")) %>%
column_spec(1, width = "16em") %>%
column_spec(2, width = "12em")
```
Row {data-height=150}
-------------------------------------------------------------------------------
### Notas finales
- **Propósito docente**: comunicar patrón principal con un GLM simple
- **Limitaciones**: pocas variables; no usar para decisiones reales